# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/netflix-shows/netflix_titles.csv
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Load the Netflix titles CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')
from ydata_profiling import ProfileReport
/opt/conda/lib/python3.10/site-packages/numba/core/decorators.py:262: NumbaDeprecationWarning: numba.generated_jit is deprecated. Please see the documentation at: https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-generated-jit for more information and advice on a suitable replacement. warnings.warn(msg, NumbaDeprecationWarning) /opt/conda/lib/python3.10/site-packages/visions/backends/shared/nan_handling.py:51: NumbaDeprecationWarning: The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details. def hasna(x: np.ndarray) -> bool:
# Build a ydata-profiling report over the whole frame; as a bare expression it is
# shown as the cell result.
ProfileReport(df)
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
# Preview the first rows of the dataset.
df.head()
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | NaN | United States | September 25, 2021 | 2020 | PG-13 | 90 min | Documentaries | As her father nears the end of his life, filmm... |
| 1 | s2 | TV Show | Blood & Water | NaN | Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... | South Africa | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries | After crossing paths at a party, a Cape Town t... |
| 2 | s3 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Act... | To protect his family from a powerful drug lor... |
| 3 | s4 | TV Show | Jailbirds New Orleans | NaN | NaN | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | Docuseries, Reality TV | Feuds, flirtations and toilet talk go down amo... |
| 4 | s5 | TV Show | Kota Factory | NaN | Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... | India | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, Romantic TV Shows, TV ... | In a city of coaching centers known to train I... |
# Count the missing values in each column.
df.isnull().sum()
show_id 0 type 0 title 0 director 2634 cast 825 country 831 date_added 10 release_year 0 rating 4 duration 3 listed_in 0 description 0 dtype: int64
# List the distinct values found in the 'rating' column.
df['rating'].unique()
array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR', nan,
'TV-Y7-FV', 'UR'], dtype=object)
# Replace missing country values with the most common country.
mode_country = df['country'].mode()[0]
# Assign the result back instead of calling fillna(..., inplace=True) on the column
# selection: the chained in-place form is deprecated and does not update df when
# pandas Copy-on-Write is enabled.
df['country'] = df['country'].fillna(mode_country)
# Bar chart of how many titles fall into each content type, with the count on each bar.
ax = sns.countplot(data=df, x='type', palette='rocket')
first_bar_group = ax.containers[0]
ax.bar_label(first_bar_group)
ax.set_title('Number of Movies and TV Shows on Netflix')
Text(0.5, 1.0, 'Number of Movies and TV Shows on Netflix')
# Number of titles per value of the 'country' column, most frequent first.
country_counts = df['country'].value_counts()
# Keep only the ten most frequent values so the chart stays readable.
top_countries = country_counts.head(10)

# Draw the bar chart on a fresh 10x6 figure.
plt.figure(figsize=(10, 6))
top_countries.plot.bar()
plt.title('Amount of Content per Country')
plt.xlabel('Country')
plt.ylabel('Number of Shows/Movies')
# Slanted, right-aligned tick labels keep long country names from overlapping.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# Number of titles per content type.
type_count = df['type'].value_counts()
# Plotting the bar chart
plt.figure(figsize=(5, 6))
type_count.plot(kind='bar')
plt.title('Most common type of content')
# The bars are content types, so label the axis accordingly.
plt.xlabel('Type')
plt.ylabel('Number of Shows/Movies')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()
plt.show()
# Turn each comma-separated 'listed_in' string into a list stored in a new 'genres' column.
df['genres'] = df['listed_in'].str.split(', ')
# One entry per (title, genre) pair; rows without a genre list contribute nothing.
all_genres = df['genres'].explode().dropna().tolist()
# Frequency of each genre across the whole catalogue.
genre_counts = pd.Series(all_genres).value_counts()
# The genre with the highest count.
most_common_genre = genre_counts.idxmax()
print(f"The most common genre for a movie or TV show is: {most_common_genre}")
The most common genre for a movie or TV show is: International Movies
# Create a new column 'cast_list' by splitting the comma-separated 'cast' values.
df['cast_list'] = df['cast'].str.split(', ')
# Flatten the per-title lists into one list of actor names (titles without a cast are skipped).
cast_list = [actor for sublist in df['cast_list'].dropna() for actor in sublist]
# Count how many titles each actor appears in.
cast_list_count = pd.Series(cast_list).value_counts()
most_casted_actor = cast_list_count.idxmax()
# This cell reports an actor, not a genre, so the message says so.
print(f"The most frequently cast actor is: {most_casted_actor}")
The most common genre for a movie or TV show is: Anupam Kher
# Show the column labels, including the derived 'genres' and 'cast_list' columns.
df.columns
Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
'release_year', 'rating', 'duration', 'listed_in', 'description',
'genres', 'cast_list'],
dtype='object')
# Inspect the raw 'duration' strings (minutes for movies, season counts for TV shows).
df['duration']
0 90 min
1 2 Seasons
2 1 Season
3 1 Season
4 2 Seasons
...
8802 158 min
8803 2 Seasons
8804 88 min
8805 88 min
8806 111 min
Name: duration, Length: 8807, dtype: object
# Split the catalogue by content type. The 'type' column holds 'TV Show' and 'Movie'
# (singular); filtering on 'Movies' would match nothing. .copy() gives each subset
# its own data so later column assignments do not write into a slice of df.
tv_shows = df[df['type'] == 'TV Show'].copy()
movies = df[df['type'] == 'Movie'].copy()
# Keep only the leading number of each duration string (e.g. '2 Seasons' -> 2).
# The pattern is a raw string so that \d is not treated as a string escape.
tv_shows['duration'] = pd.to_numeric(tv_shows['duration'].str.extract(r'(\d+)', expand=False), errors='coerce')
/tmp/ipykernel_138/625843817.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
tv_shows['duration'] = pd.to_numeric(df['duration'].str.extract('(\d+)', expand=False), errors='coerce')
# Row label of the TV show with the largest numeric duration.
longest_show_label = tv_shows['duration'].idxmax()
# Full record for that show.
longest_running_show = tv_shows.loc[longest_show_label]
print(f"Longest running show: {longest_running_show['title']}")
Longest running show: Grey's Anatomy
# Keep only the leading number of each movie's duration string (e.g. '90 min' -> 90).
# assign() builds a new frame instead of writing into a slice of df, which avoids
# SettingWithCopyWarning; the raw-string pattern avoids the invalid '\d' string escape.
movies = movies.assign(
    duration=pd.to_numeric(movies['duration'].str.extract(r'(\d+)', expand=False), errors='coerce')
)
# Horizontal bar chart of how many titles carry each rating, most frequent rating first.
rating_order = df['rating'].value_counts().index
fig, ax = plt.subplots(figsize=(12, 8))
ax = sns.countplot(y=df['rating'], order=rating_order, palette='rocket', ax=ax)
# Write the count next to each bar.
ax.bar_label(ax.containers[0])
ax.set_title('Number of ratings')
Text(0.5, 1.0, 'Number of ratings')
# Print column dtypes, non-null counts and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8807 entries, 0 to 8806 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 show_id 8807 non-null object 1 type 8807 non-null object 2 title 8807 non-null object 3 director 6173 non-null object 4 cast 7982 non-null object 5 country 8807 non-null object 6 date_added 8797 non-null object 7 release_year 8807 non-null int64 8 rating 8803 non-null object 9 duration 8804 non-null object 10 listed_in 8807 non-null object 11 description 8807 non-null object 12 genres 8807 non-null object 13 cast_list 7982 non-null object dtypes: int64(1), object(13) memory usage: 963.4+ KB
# Parse 'date_added' into datetimes; format='mixed' lets pandas infer the format per value.
parsed_dates = pd.to_datetime(df['date_added'], format='mixed')
df['date_added'] = parsed_dates
# Derive the calendar month and year each title was added.
df['month_added'] = parsed_dates.dt.month
df['year_added'] = parsed_dates.dt.year
df.head(3)
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | genres | cast_list | month_added | year_added | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | NaN | United States | 2021-09-25 | 2020 | PG-13 | 90 min | Documentaries | As her father nears the end of his life, filmm... | [Documentaries] | NaN | 9.0 | 2021.0 |
| 1 | s2 | TV Show | Blood & Water | NaN | Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... | South Africa | 2021-09-24 | 2021 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries | After crossing paths at a party, a Cape Town t... | [International TV Shows, TV Dramas, TV Mysteries] | [Ama Qamata, Khosi Ngema, Gail Mabalane, Thaba... | 9.0 | 2021.0 |
| 2 | s3 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... | United States | 2021-09-24 | 2021 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Act... | To protect his family from a powerful drug lor... | [Crime TV Shows, International TV Shows, TV Ac... | [Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nab... | 9.0 | 2021.0 |
# Inspect how often each rating value occurs (the result is only displayed when this
# is the last expression of its notebook cell).
df.rating.value_counts()
# A few rows carry a duration string ('74 min', '84 min', '66 min') in the rating
# column; this notebook treats them as 'TV-MA'. The result is assigned back rather
# than using replace(..., inplace=True) on the column selection, which is deprecated
# and does not update df when pandas Copy-on-Write is enabled.
df['rating'] = df['rating'].replace(['74 min', '84 min', '66 min'], 'TV-MA')
# Hand-assigned ratings for four specific row labels
# (presumably the four rows reported above as missing a rating - TODO confirm).
df.loc[[5989], ['rating']] = 'TV-PG'
df.loc[[6827], ['rating']] = 'TV-14'
df.loc[[7312], ['rating']] = 'TV-PG'
df.loc[[7537], ['rating']] = 'PG-13'
# Collapse the TV ratings onto their film counterparts and merge NR/UR into one bucket.
# No target value is also a source value, so one mapping equals the step-by-step rewrite.
df['rating'] = df['rating'].replace({
    'TV-Y7-FV': 'TV-Y7',
    'TV-G': 'G',
    'TV-PG': 'PG',
    'TV-MA': 'R',
    'NR': 'nrur',
    'UR': 'nrur',
})
# Same rating chart as before, now drawn from the normalised rating values.
rating_order = df['rating'].value_counts().index
fig, ax = plt.subplots(figsize=(12, 8))
ax = sns.countplot(y=df['rating'], order=rating_order, palette='rocket', ax=ax)
# Write the count next to each bar.
ax.bar_label(ax.containers[0])
ax.set_title('Number of ratings')
Text(0.5, 1.0, 'Number of ratings')
# Create a new 'maturity_level' column that categorises each title as kid, teen or adult.
ratings_by_level = {
    'kid': ['TV-Y', 'TV-Y7', 'G', 'PG'],
    'teen': ['PG-13', 'TV-14'],
    'adult': ['R', 'NC-17', 'nrur'],
}
for level, level_ratings in ratings_by_level.items():
    # Rows whose rating is in none of the groups keep a missing maturity level.
    df.loc[df['rating'].isin(level_ratings), 'maturity_level'] = level
# Horizontal bar chart of how many titles fall into each maturity level.
fig, ax = plt.subplots(figsize=(12, 8))
ax = sns.countplot(y=df['maturity_level'], palette='rocket', ax=ax)
# Write the count next to each bar.
ax.bar_label(ax.containers[0])
ax.set_title('Number of movies')
Text(0.5, 1.0, 'Number of movies')
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from fuzzywuzzy import process
# Function to preprocess and create a content column
def preprocess_data(df):
    """Clean the Netflix titles frame in place and add derived text columns.

    The frame is modified in place; nothing is returned.

    Steps:
      * ratings that are really durations ('74 min', '84 min', '66 min') become 'TV-MA';
      * missing countries are filled with the most common country;
      * four specific row labels receive hand-assigned ratings (skipped when the
        frame does not contain that label);
      * TV ratings are collapsed onto their film counterparts and NR/UR are merged;
      * 'maturity_level' (kid / teen / adult) is derived from the rating;
      * 'content' joins the descriptive columns into one space-separated string.

    Args:
        df: DataFrame with the columns 'title', 'director', 'cast', 'country',
            'rating', 'duration', 'listed_in' and 'description'.
    """
    # Assign results back instead of using inplace=True on a column selection:
    # the chained in-place form is deprecated and is a no-op under Copy-on-Write.
    df['rating'] = df['rating'].replace(['74 min', '84 min', '66 min'], 'TV-MA')

    # Fill missing countries with the most common one (skip if every value is missing).
    country_modes = df['country'].mode()
    if not country_modes.empty:
        df['country'] = df['country'].fillna(country_modes.iloc[0])

    # Hand-assigned ratings keyed by row label; labels absent from this frame are
    # ignored so the function also works on subsets of the dataset.
    manual_ratings = {5989: 'TV-PG', 6827: 'TV-14', 7312: 'TV-PG', 7537: 'PG-13'}
    for row_label, rating in manual_ratings.items():
        if row_label in df.index:
            df.loc[row_label, 'rating'] = rating

    # Collapse TV ratings onto film ratings. No target value is also a source
    # value, so a single mapping is equivalent to rewriting them one at a time.
    df['rating'] = df['rating'].replace({
        'TV-Y7-FV': 'TV-Y7',
        'TV-G': 'G',
        'TV-PG': 'PG',
        'TV-MA': 'R',
        'NR': 'nrur',
        'UR': 'nrur',
    })

    # Maturity level per rating; ratings outside the mapping stay missing.
    level_by_rating = {
        'TV-Y': 'kid', 'TV-Y7': 'kid', 'G': 'kid', 'PG': 'kid',
        'PG-13': 'teen', 'TV-14': 'teen',
        'R': 'adult', 'NC-17': 'adult', 'nrur': 'adult',
    }
    df['maturity_level'] = df['rating'].map(level_by_rating)

    # Build one text blob per title. Missing values become empty strings BEFORE the
    # string conversion (otherwise they end up as the literal token 'nan'), and every
    # field is separated by a space so rating and duration do not fuse into one token.
    text_columns = ['title', 'director', 'cast', 'country', 'rating', 'duration',
                    'listed_in', 'description', 'maturity_level']
    text = df[text_columns].fillna('').astype(str)
    df['content'] = text[text_columns[0]].str.cat(
        [text[column] for column in text_columns[1:]], sep=' '
    )
# Function to create TF-IDF matrix
def create_tfidf_matrix(df):
    """Fit a default TfidfVectorizer on df['content'] and return the transformed matrix."""
    return TfidfVectorizer().fit_transform(df['content'])
# Function to create Bag of Words (BoW) matrix
def create_bow_matrix(df):
    """Fit a default CountVectorizer on df['content'] and return the transformed matrix."""
    return CountVectorizer().fit_transform(df['content'])
# Function to compute TF-IDF cosine similarity
def tfidf_cosine_similarity(tfidf_matrix):
    """Return the pairwise cosine similarity between all rows of tfidf_matrix."""
    return cosine_similarity(tfidf_matrix)
# Function to compute BoW cosine similarity
def bow_cosine_similarity(bow_matrix):
    """Return the pairwise cosine similarity between all rows of bow_matrix."""
    return cosine_similarity(bow_matrix)
# Function to train Word2Vec model
def train_word2vec(df):
    """Tokenise df['content'] and train a Word2Vec model on the tokens.

    Adds (or overwrites) the 'tokenized_content' column on df as a side effect.

    Returns:
        The trained Word2Vec model (100-dimensional vectors, window 5, 10 epochs).
    """
    # Store the tokens on the frame so callers can reuse them.
    df['tokenized_content'] = df['content'].apply(simple_preprocess)
    sentences = df['tokenized_content']

    w2v = Word2Vec(vector_size=100, window=5, min_count=1, workers=4)
    w2v.build_vocab(sentences)
    w2v.train(sentences, total_examples=w2v.corpus_count, epochs=10)
    return w2v
# Function to average word vectors for a text
def average_word_vectors(words, model, vocabulary, num_features):
    """Return the mean of the model vectors of the words found in vocabulary.

    Args:
        words: iterable of tokens.
        model: object whose ``wv[word]`` lookup yields a vector of length num_features.
        vocabulary: container of the words that may be looked up.
        num_features: length of the returned vector.

    Returns:
        A float64 array of length num_features; all zeros when no word is in vocabulary.
    """
    total = np.zeros((num_features,), dtype="float64")
    matched = 0.
    # Only words present in the vocabulary contribute to the sum.
    for token in (w for w in words if w in vocabulary):
        matched += 1.
        total = np.add(total, model.wv[token])
    if not matched:
        return total
    return np.divide(total, matched)
def averaged_word_vectorizer(corpus, model, num_features):
    """Return an array with one averaged word vector per tokenised document in corpus."""
    # Build the vocabulary set once so membership tests are cheap for every document.
    known_words = set(model.wv.index_to_key)
    rows = []
    for tokens in corpus:
        rows.append(average_word_vectors(tokens, model, known_words, num_features))
    return np.array(rows)
# Function to compute Word2Vec-based similarity
def word2vec_similarity(user_movie, df, top_n=19):
    """Print the titles most similar to user_movie using averaged Word2Vec vectors.

    The query is first resolved to the closest existing title by fuzzy matching.
    A Word2Vec model is then trained on df['content'] and each title is represented
    by the mean of its word vectors. Training also adds the 'tokenized_content'
    column to df.

    Args:
        user_movie: free-text title typed by the user.
        df: DataFrame with 'title' and 'content' columns.
        top_n: number of recommendations to print (default 19).

    Returns:
        None; results are printed as "<row label>: <title>".
    """
    user_movie = find_similar_movies_fuzzy(df, user_movie)

    # Reuse the shared training routine rather than duplicating its settings here.
    model = train_word2vec(df)

    # Row POSITION of the matched title; the feature array is positional, so this
    # must not be confused with the frame's index labels.
    query_pos = int(np.flatnonzero((df['title'] == user_movie).to_numpy())[0])
    w2v_feature_array = averaged_word_vectorizer(
        corpus=df['tokenized_content'], model=model, num_features=model.vector_size
    )

    # Cosine similarity between the matched title and every title.
    similarity_scores = cosine_similarity(
        w2v_feature_array[query_pos:query_pos + 1], w2v_feature_array
    )[0]

    # Highest score first (stable, so ties keep row order). Drop the query itself
    # explicitly instead of assuming it is ranked first.
    ranked = np.argsort(-similarity_scores, kind='stable')
    ranked = ranked[ranked != query_pos][:top_n]

    for pos in ranked:
        print("{}: {}".format(df.index[pos], df['title'].iloc[pos]))
# Function to compute BoW-based similarity
def bow_similarity(user_movie, df, bow_matrix, top_n=19):
    """Print the titles most similar to user_movie by bag-of-words cosine similarity.

    Args:
        user_movie: free-text title typed by the user; resolved to the closest
            existing title by fuzzy matching.
        df: DataFrame with a 'title' column, row-aligned with bow_matrix.
        bow_matrix: row-sliceable matrix (dense or CSR sparse), one row per title.
        top_n: number of recommendations to print (default 19).

    Returns:
        None; results are printed as "<row label>: <title>".
    """
    user_movie = find_similar_movies_fuzzy(df, user_movie)

    # Row POSITION of the matched title; matrix rows are positional, so this must
    # not be confused with the frame's index labels.
    query_pos = int(np.flatnonzero((df['title'] == user_movie).to_numpy())[0])

    # Compare only the query row against all rows; the full N x N similarity
    # matrix is never needed to rank neighbours of a single title.
    similarity_scores = cosine_similarity(
        bow_matrix[query_pos:query_pos + 1], bow_matrix
    )[0]

    # Highest score first (stable, so ties keep row order). Drop the query itself
    # explicitly instead of assuming it is ranked first.
    ranked = np.argsort(-similarity_scores, kind='stable')
    ranked = ranked[ranked != query_pos][:top_n]

    for pos in ranked:
        print("{}: {}".format(df.index[pos], df['title'].iloc[pos]))
# Function to compute TF-IDF-based similarity
def tfidf_similarity(user_movie, df, tfidf_matrix, top_n=19):
    """Print the titles most similar to user_movie by TF-IDF cosine similarity.

    Args:
        user_movie: free-text title typed by the user; resolved to the closest
            existing title by fuzzy matching.
        df: DataFrame with a 'title' column, row-aligned with tfidf_matrix.
        tfidf_matrix: row-sliceable matrix (dense or CSR sparse), one row per title.
        top_n: number of recommendations to print (default 19).

    Returns:
        None; results are printed as "<row label>: <title>".
    """
    user_movie = find_similar_movies_fuzzy(df, user_movie)

    # Row POSITION of the matched title; matrix rows are positional, so this must
    # not be confused with the frame's index labels.
    query_pos = int(np.flatnonzero((df['title'] == user_movie).to_numpy())[0])

    # Compare only the query row against all rows; the full N x N similarity
    # matrix is never needed to rank neighbours of a single title.
    similarity_scores = cosine_similarity(
        tfidf_matrix[query_pos:query_pos + 1], tfidf_matrix
    )[0]

    # Highest score first (stable, so ties keep row order). Drop the query itself
    # explicitly instead of assuming it is ranked first.
    ranked = np.argsort(-similarity_scores, kind='stable')
    ranked = ranked[ranked != query_pos][:top_n]

    for pos in ranked:
        print("{}: {}".format(df.index[pos], df['title'].iloc[pos]))
# Function to find similar movies using fuzzy string matching
def find_similar_movies_fuzzy(df, movie_name):
    """Return the title in df['title'] that best fuzzy-matches movie_name."""
    candidates = process.extract(movie_name, df['title'], limit=5)
    # Candidates come back best-first; the matched title is the first field of each entry.
    best_candidate = candidates[0]
    return best_candidate[0]
def similar_movies_fuzzy(df, movie_name):
    """Print the five titles in df['title'] that best fuzzy-match movie_name, with scores."""
    matches = process.extract(movie_name, df['title'], limit=5)
    print("Advanced Search and similar Alternatives")
    # Each match is a (title, score, row label) triple; the row label is not shown.
    for title, match_score, _row_label in matches:
        print(f"Movie: {title}, Similarity Score: {match_score}")
# Main program
if __name__ == "__main__":
    # Load the raw Netflix catalogue.
    df = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')

    # Clean the frame in place and build the 'content' text column.
    preprocess_data(df)

    # Vectorise the content once; both matrices are reused for the lookups below.
    tfidf_matrix = create_tfidf_matrix(df)
    bow_matrix = create_bow_matrix(df)

    # Get user input.
    user_movie = input("Enter a movie title: ")

    if not user_movie.strip():
        # An empty query cannot be matched meaningfully, so do not run the searches.
        print("No movie title entered.")
    else:
        similar_movies_fuzzy(df, user_movie)

        print("\nSimilar Movies (TF-IDF Cosine Similarity):")
        tfidf_similarity(user_movie, df, tfidf_matrix)

        print("\nSimilar Movies (BoW Cosine Similarity):")
        bow_similarity(user_movie, df, bow_matrix)

        print("\nSimilar Movies (Word2Vec Similarity):")
        # word2vec_similarity trains its own model, prints its results and returns
        # None, so there is nothing useful to bind to a variable here.
        word2vec_similarity(user_movie, df)
Advanced Search and similar Alternatives Movie: Superbad, Similarity Score: 100 Movie: AdĂș, Similarity Score: 90 Movie: P, Similarity Score: 90 Movie: Superstar, Similarity Score: 71 Movie: Esperando la carroza, Similarity Score: 68 Similar Movies (TF-IDF Cosine Similarity): 4938: Seth Rogen's Hilarity for Charity 178: The Interview 3305: Seth Meyers: Lobby Baby 5900: Wet Hot American Summer 2010: How to Train Your Dragon 2 346: Pineapple Express 5540: Win It All 5035: Dragons: Race to the Edge 6710: Evan Almighty 145: House Party 2 1833: ParaNorman 7515: Movie 43 4289: Dragons: Dawn of the Dragon Racers 2454: The Disaster Artist 4718: Like Father 5133: Trolls Holiday Special 6414: Can't Hardly Wait 4629: Maniac 1443: QB1: Beyond the Lights Similar Movies (BoW Cosine Similarity): 145: House Party 2 7498: Monster High: Haunted 7121: Jay and Silent Bob Strike Back 5833: Brahman Naman 8401: The Longest Yard 1830: What Did I Mess 3314: 100 Things to do Before High School 648: Too Hot to Handle 8630: Trip to Bhangarh: Asia's Most Haunted Place 3912: Generation Iron 3 6563: Dare to Be Wild 4050: Weapon of Choice 8804: Zombieland 3629: Otherhood 7585: Nightcrawler 8714: Welcome to Monster High: The Origin Story 7715: Patron Mutlu Son Istiyor 8624: Tremors 4: The Legend Begins 7046: I Fine... Thank You... Love You Similar Movies (Word2Vec Similarity): 6533: Cool Hand Luke 8608: Total Frat Movie 622: Lying and Stealing 956: Zack and Miri Make a Porno 1510: The Con Is On 5329: Chocolate City: Vegas Strip 6321: Black & Privileged: Volume 1 7579: New York Minute 7369: Mad Money 48: Training Day 6637: Doubt 142: Freedom Writers 630: Killing Them Softly 1367: Hell Fest 1034: Synchronic 6896: Green Room 5419: You Get Me 8046: Sniper: Special Ops 5830: Rebirth